import numpy as np 
import pandas as pd
import re
import string
# Load the raw dataset; the steps below require a 'posts' column of strings.
df = pd.read_csv('data/mbti_1.csv')

# Each 'posts' cell packs several posts into one string delimited by "|||".
df['seperated_post'] = df['posts'].apply(lambda x: x.strip().split("|||"))
df['num_post'] = df['seperated_post'].apply(len)
# Expose the row label as a regular column so it can serve as a join key.
df['id'] = df.index

# Build one row per individual post, with columns 'id' and 'idposts'.
# Series.explode is used instead of the "wide DataFrame + stack()" trick:
# that approach pads shorter post lists with None and depends on stack()
# dropping the padding, which the newer stack implementation
# (future_stack=True) no longer does.
df1 = (
    df.set_index('id')['seperated_post']
    .explode()
    .reset_index(name='idposts')
)
# Re-attach the per-row columns of df to every exploded post.
df1 = df1.join(df.set_index('id'), on='id', how='left')

# Deletion table covering every character in string.punctuation. Built once
# at import time; str.translate has no escaping pitfalls, unlike a regex
# character class (where the backslash would escape the following ']').
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text: str) -> str:
    """Return a normalised, lower-cased copy of *text*.

    Steps, applied in order:

    1. Remove every run that starts with ``http`` up to the next whitespace.
    2. Remove all ASCII digits and lower-case the text.
    3. Replace ``@`` followed by letters with the token ``user``.
    4. Remove every character in ``string.punctuation`` (backslash included).

    Whitespace is left untouched, so removed tokens may leave double spaces.
    """
    result = re.sub(r'http[^\s]*', '', text)
    result = re.sub(r'[0-9]+', '', result).lower()
    # Digits are already gone and the text is lower-case at this point, so
    # the class effectively matches letters only.
    result = re.sub(r'@[a-z0-9]+', 'user', result)
    # The previous regex-based removal left backslashes in place.
    return result.translate(_PUNCTUATION_TABLE)
    
# Normalise every individual post.
df1['idposts'] = df1['idposts'].map(clean_text)

# Gather the cleaned posts back into one list per 'id'.
cleaned_df = (
    df1.groupby('id')['idposts']
    .apply(list)
    .reset_index()
)

# Join each list into a single space-separated string. The assignment aligns
# on index labels: cleaned_df carries a fresh 0..n-1 index after reset_index().
df['cleaned_post'] = cleaned_df['idposts'].apply(' '.join)

# Export only the cleaned text and its label.
newdf = pd.DataFrame({
    'cleaned_post': df['cleaned_post'],
    'type': df['type'],
})
newdf.to_csv('data/newmbti.csv')